knitr::opts_chunk$set(echo = TRUE,
                      eval = FALSE,# I already ran and saved the data in data folder
                      warning = FALSE,
                      message = FALSE,
                      error = FALSE)

Set-up

# loading packages
library(rvest) # for web scraping functions using css tags
library(tidyverse) # for basic data wrangling
library(polite) # for checking website permissions
library(janitor) # for clean column names
library(RSelenium) # for scraping dynamic tables 
library(netstat) # to find free port

The main results page on Butte to Butte website looks as this:

knitr::include_graphics(here::here("images/b2b_results_page.png"))

There are hyperlinks embedded on the page for each race year which redirects to the annual race pages. First, I will extract all the year-event page links.

Scraping static results from pages made using EER style

Process:

knitr::include_graphics(here::here("images/eer_table_extract.gif"))

extract_table_eer <- function(link, year){

  # check permissions
    year_page_session <- bow(link,
                       user_agent = "HK result scraping",
                       force = TRUE)
    
  # extract hyperlinks specific to results page - second on the each year's tab
    year_results_link <- scrape(year_page_session) %>%
                            html_element(".nav-tabs") %>%
                            html_elements("li") %>%
                            html_elements("a") %>%
                            #html_text() 
                            html_attr("href") %>%
                            .[2]
    
  # extract overall-results links for each format type
    
    year_results_session <- year_page_session %>%
        nod(path = year_results_link)
    
    race_codes <- tibble(
        race_format =  scrape(year_results_session) %>%
                                     html_element(".tabs") %>%
                                     html_elements("p") %>%
                                     html_text(),
        race_link_code = scrape(year_results_session) %>%
                           html_element(".tabs") %>%
                           html_elements("p") %>%
                           html_attr("race-code"),
        race_links = paste0(year_results_link,"/",race_link_code))
    
    results_data <- map2_df(race_codes$race_format, race_codes$race_links,
                            ~year_results_session %>%
                                nod(path = .y) %>%
                                scrape(.) %>%
                                html_element("table") %>%
                                html_table() %>%
                                clean_names() %>%
                                # removing PII information
                                select(-c(x, name, bib, age)) %>%
                                mutate(
                                    race_format = .x,
                                    race_year = year
                                ))
    
    return(results_data)
}
# scrape eer style pages
eer_year_results <- b2b_annual_race_links_static %>%
    filter(html_design == "eer") 

eer_year_results <- map2_df(eer_year_results$link, eer_year_results$year, 
                            ~extract_table_eer(.x, .y))

Scraping static results from pages made using ORR style

Process:

knitr::include_graphics(here::here("images/orr_text_table_extract.gif"))

extract_table_orr <- function(link, year){

  # check permissions
    year_page_session <- bow(link,
                       user_agent = "HK result scraping",
                       force = TRUE)
    
  # extract hyperlinks specific to results page - ending with Overall - Text Results
    year_event_results_links <- tibble(
                description = scrape(year_page_session) %>%
                                html_element("body") %>%
                                html_element("main") %>%
                                html_elements("a") %>%
                                html_text(),
                links =  scrape(year_page_session) %>%
                        html_element("body") %>%
                        html_element("main") %>%
                        html_elements("a") %>%
                        html_attr('href')) %>%
          filter(grepl("Text", description)) %>%
          # links have relative path - adding the first part for full address
          mutate(links = paste0("https://onlineraceresults.com", links))
        
    
  # extract text files from each event-format page
    
    format_results_text_files <- map(year_event_results_links$links,
                                     ~ year_page_session %>%
                                         nod(path = .x) %>%
                                         scrape(.) %>%
                                         html_element("pre") %>%
                                         html_text() %>%
                                         gsub("(.*)(Place )(.*)", "\\2\\3", .) %>%
                                         gsub("\r", "", .) %>%
                                         gsub("/", "_", .) %>%
                                         str_split(., "\n") %>% 
                                         unlist()
    )
                                                                                 
  # convert text file to dataframe

  # initialize event_data inputs
    results_data <- vector(mode = "list", length = length(format_results_text_files))
    
    for(i in seq_along(format_results_text_files)){
    # count columns width in the dataframe using "=" signs
        
    column_size <- str_split(format_results_text_files[[i]][2], "= ") %>% 
        unlist() 
    column_size <- map_dbl(column_size[1:(length(column_size)-1)],
                           ~str_count(.x,"=")+2)
    column_size <- cumsum(column_size)
    column_size <- c(1, column_size)
    
    # store column names
    column_names <- vector(mode = "character", length = length(column_size) - 1)
    
    for (j in 2:length(column_size)){
        column_names[j - 1] <- substr(format_results_text_files[[i]][1],
                                      column_size[j-1], column_size[j]) %>%
        gsub(" ","",.)
    }

    # make a dataframe to store each row's values
    results_data[[i]] <- tibble(
                    column_names,
                    dummy_val = 1
                ) %>%
                    pivot_wider(
                        names_from = column_names,
                        values_from = dummy_val
                    )

    # initialize row inputs
    row_value <- vector(mode = "character", length = length(column_size) - 1)
    
    for(j in 3:length(format_results_text_files[[i]])){
        
    row_value <- map_chr(2:length(column_size), ~substr(format_results_text_files[[i]][j], column_size[.x-1], column_size[.x]) %>%
        gsub(" ","",.))
    
    results_data[[i]] <- rbind(results_data[[i]], row_value)}
}
  
    # add event detail and year to each data file
    # then reduce to a single data frame
    
    results_data <- map2(results_data, 
                         year_event_results_links$description, ~.x %>%
                          mutate(race_format = gsub("(.*)( Overall - Text Format)", "\\1", .y),
                       race_year = year) %>%
                    clean_names() %>%
        filter(place!= "", name != "1") %>%
        select(-c(name, ag, no))) %>%
        reduce(., bind_rows)
    
    return(results_data)
}
# scrape orr style pages
orr_year_results <- b2b_annual_race_links_static %>%
    filter(html_design == "orr") 

orr_year_results <- map2_df(orr_year_results$link, orr_year_results$year, 
                            ~extract_table_orr(.x, .y))

Scraping dynamic pages

Process:

knitr::include_graphics(here::here("images/dynamic_table_extract.gif"))

# get result-links to loop through
# within each link, the steps are to click see all results, see all rows, and scrape the webpage

extract_hyperlinks <- function(link, year){

  # check permissions
    year_page_session <- bow(link,
                       user_agent = "HK result scraping",
                       force = TRUE)
    
  # extract hyperlinks specific to results page - ending with Overall - Text Results
    year_event_results_links <- tibble(
                description = scrape(year_page_session) %>%
                                html_elements(xpath = '//*[(@id = "orr-event-races")]') %>%
                                html_elements("a") %>%
                                html_text(),
                links =  scrape(year_page_session) %>%
                        html_elements(xpath = '//*[(@id = "orr-event-races")]') %>%
                        html_elements("a") %>%
                        html_attr('href'),
                nrow = length(description),
                further_filter = ifelse(nrow > 2, 1, 0),
                link_to_keep = ifelse(further_filter == 1 & grepl("Overall", description), 1, 0),
                year = year
                ) %>%
          filter(further_filter == 0 | (further_filter == 1 & link_to_keep == 1)) %>%
          # links have relative path - adding the first part for full address
          mutate(links = paste0("https://onlineraceresults.com", links))
    
    return(year_event_results_links)
}
dynamic_hyperlinks <- map2_df(b2b_annual_race_links_dynamic$link,
                               b2b_annual_race_links_dynamic$year,
                               ~extract_hyperlinks(.x, .y))
# set-up driver_object and start browser
rs_driver_object <- rsDriver(
    browser = "chrome",
    chromever = "126.0.6478.126",
    verbose = TRUE,
    port = netstat::free_port() 
)

# set up the client side
rs_driver_client <- rs_driver_object$client

Now, we will navigate to each website from out hyperlink page. Then, we will click show all results, show all rows, and scrape the page. We will

extract_table_orr_dynamic <- function(link, year, format){
    # navigate to the page
    rs_driver_client$navigate(link)
    
    # click show all results
    show_all_results <- rs_driver_client$findElement(using = 'xpath', '//*[(@id = "orr-race-results-search")]//li[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//a')
    
    show_all_results$clickElement()
    
    # click show all rows
    show_all_rows <- rs_driver_client$findElement(using = 'xpath', '//*[(@id = "orr-main-event-race-results")]//a[(((count(preceding-sibling::*) + 1) = 5) and parent::*)]')

    show_all_rows$clickElement()
    
    # get page source and scrape page
    table_location <- rs_driver_client$findElement(using = 'id',
                                      'race-results-table')

    table_html <- table_location$getPageSource()
    
    # extract table
    table_df <-  read_html(table_html %>% unlist()) %>%
                    # it's the third table
                    html_table() 
    
    # it's either the third or the first. Earleir years have pages that only have one table
    results_table_index <- ifelse(length(table_df) > 1, 3, 1)
    
    table_df <- table_df %>%
                    .[[results_table_index]] %>%
                    clean_names() %>%
                    # drop PII and irrelevant cols
                    select(-c(x, x_2, fn, ln, no)) %>%
                    mutate(race_format = format,
                           race_year = year)
    
    
    return(table_df)
}
# loop through hyperlinks and get the tables
dynamic_table_results <- pmap(list(dynamic_hyperlinks$links,
                                   dynamic_hyperlinks$year,
                                   dynamic_hyperlinks$description),
                              ~extract_table_orr_dynamic(..1, ..2, ..3))

So satisfying when it magically works!

knitr::include_graphics(here::here("images/dynamic_scraping.gif"))

# close dynamic browser
rs_driver_client$closeall()
#collapse list elements into one dataframe
dynamic_year_results <- reduce(dynamic_table_results, bind_rows)

Join results of all formats

# harmonizing column names
str(eer_year_results)
str(orr_year_results)
str(dynamic_year_results)

Just realized there is no variable ‘state’ in the tables I scraped dynamically - which was kind of the whole point. For now, I’m harmonizing the files and might come back to the project to fix the state information.

# changing colnames and col style to err_year_results style
orr_year_results <- orr_year_results %>%
    rename("sex" = "s",
           "state" = "st",
           "gun_time" = "time") %>% # assuming it will be the guntime since rfid wasn't too common a couple years ago
    mutate(div_place = gsub("(\\d+)(_)(\\d+)", "\\1", div_tot),
           div_total = gsub("(\\d+)(_)(\\d+)", "\\3", div_tot),
           div = gsub("([A-Z]\\d{2})(\\d{2})", "\\1-\\2", div),
           across(c(place, div_place, div_total), as.integer)
           ) %>%
    select(-div_tot)

dynamic_year_results <- dynamic_year_results %>%
    rename("place" = "overall",
           "div"= "division",
           "gun_time" = "time") %>% #same assumption
    mutate(div_place = gsub("(\\d+)(/)(\\d+)", "\\1", divpl),
           div_total = gsub("(\\d+)(/)(\\d+)", "\\3", divpl),
           sex_place = gsub("(\\d+)(/)(\\d+)", "\\1", sexpl),
           sex_total = gsub("(\\d+)(/)(\\d+)", "\\3", sexpl),
           div = gsub("([A-Z]\\d{2})(\\d{2})", "\\1-\\2", div),
           sex = gsub("([A-Z])(.*)", "\\1", div),
           race_format = gsub(" Overall| Run| Mayor's", "", race_format),
           across(c(place, div_place, div_total, sex_place, sex_total), as.integer)) %>%
    select(-c(divpl, sexpl))
#changing race_format from 4 Mile Walk to 4M Walk in eer_year_results to be consistent
eer_year_results <- eer_year_results %>%
    mutate(race_format = gsub(" Mile","M", race_format))
#join
all_year_results <- bind_rows(eer_year_results, orr_year_results, dynamic_year_results)
write.csv(all_year_results, "data/butte_to_butte_results_2000_2024.csv")